suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(RColorBrewer)
library(tidyverse)
library(VennDiagram)
})
suppressMessages({
source(here("UPSCb-common/src/R/featureSelection.R"))
source(here("UPSCb-common/src/R/volcanoPlot.R"))
source(here("UPSCb-common/src/R/gopher.R"))
})
pal=brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
TODO remember to add a function to check for correlation between logFC and transcript length - check for the effective length * Functions 1. plot specific gene expression
"line_plot" <- function(dds=dds,vst=vst,gene_id=gene_id,gene_name=character(0)){
message(paste("Plotting",gene_id))
sel <- grepl(gene_id,rownames(vst))
stopifnot(sum(sel)==1)
d <- bind_cols(as.data.frame(colData(dds)),
data.frame(value=vst[sel,]))
p <- ggplot(d,
aes(x=TIME,y=value,col=TREATMENT)) +
facet_wrap(.~TISSUE) +
geom_point() +
scale_y_continuous(name="VST expression") +
ggtitle(label=paste("Expression for: ",gene_id,ifelse(length(gene_name)>0,gene_name,"")))
suppressMessages(suppressWarnings(plot(p)))
p <- ggplot(d,
aes(x=CONDITION,y=value,col=TISSUE,group=TISSUE)) +
geom_point() + geom_smooth() +
scale_y_continuous(name="VST expression") +
ggtitle(label=paste("Expression for: ",gene_id,ifelse(length(gene_name)>0,gene_name,"")))
suppressMessages(suppressWarnings(plot(p)))
return(NULL)
}
"extract_results" <- function(dds,vst,contrast,
padj=0.01,lfc=0.5,
plot=TRUE,verbose=TRUE,
export=TRUE,default_dir=here("data/analysis/DE"),
default_prefix="DE-",
labels=colnames(dds),
sample_sel=1:ncol(dds),
expression_cutoff=0,
debug=FALSE){
if(length(contrast)==1){
res <- results(dds,name=contrast)
} else {
res <- results(dds,contrast=contrast)
}
if(plot){
par(mar=c(5,5,5,5))
volcanoPlot(res)
par(mar=mar)
}
# a look at independent filtering
if(verbose){
message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
round(metadata(res)$filterThreshold,digits=5),
names(metadata(res)$filterThreshold)))
}
if(plot){
plot(metadata(res)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter")
lines(metadata(res)$lo.fit, col="red")
abline(v=metadata(res)$filterTheta)
}
if(verbose){
message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
round(metadata(res)$filterThreshold,digits=5),
names(metadata(res)$filterThreshold)))
max.theta <- metadata(res)$filterNumRej[which.max(metadata(res)$filterNumRej$numRej),"theta"]
message(sprintf("The independent filtering maximises for %s %% of the data, corresponding to a base mean expression of %s (library-size normalised read)",
round(max.theta*100,digits=5),
round(quantile(counts(dds,normalized=TRUE),probs=max.theta),digits=5)))
}
if(plot){
qtl.exp=quantile(counts(dds,normalized=TRUE),probs=metadata(res)$filterNumRej$theta)
dat <- data.frame(thetas=metadata(res)$filterNumRej$theta,
qtl.exp=qtl.exp,
number.degs=sapply(lapply(qtl.exp,function(qe){
res$padj <= padj & abs(res$log2FoldChange) >= lfc &
! is.na(res$padj) & res$baseMean >= qe
}),sum))
plot(ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red"))
if(debug){
p <- ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_log10("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
plot(ggplot(dat,aes(x=thetas,y=number.degs)) +
geom_line() + geom_point() +
geom_hline(yintercept=dat$number.degs[1],linetype="dashed") +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE genes"))
plot(ggplot(dat,aes(x=thetas,y=number.degs[1] - number.degs),aes()) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Cumulative number of DE genes"))
plot(ggplot(data.frame(x=dat$thetas[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE genes per interval"))
plot(ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("base mean of expression") +
scale_y_continuous("Number of DE genes per interval"))
}
p <- ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_log10("base mean of expression") +
scale_y_continuous("Number of DE genes per interval") +
geom_vline(xintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
}
sel <- res$padj <= padj & abs(res$log2FoldChange) >= lfc & ! is.na(res$padj) & res$baseMean >= expression_cutoff
if(verbose){
message(sprintf("There are %s genes that are DE with the following parameters: FDR <= %s, |log2FC| >= %s, base mean expression > %s",
sum(sel),
padj,lfc,expression_cutoff))
}
if(export){
if(!dir.exists(default_dir)){
dir.create(default_dir,showWarnings=FALSE,recursive=TRUE,mode="0771")
}
write.csv(res,file=file.path(default_dir,paste0(default_prefix,"results.csv")))
write.csv(res[sel,],file.path(default_dir,paste0(default_prefix,"genes.csv")))
}
if(plot){
if(sum(sel) > 1){
heatmap.2(t(scale(t(vst[sel,sample_sel]))),
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method="ward.D2")},
trace="none",col=hpal,labRow = FALSE,
labCol=labels[sample_sel])
} else {
warning("There are not enough DE genes to create a heatmap")
}
}
return(list(all=rownames(res[sel,]),
up=rownames(res[sel & res$log2FoldChange > 0,]),
dn=rownames(res[sel & res$log2FoldChange < 0,])))
}
load(here("data/analysis/salmon/dds.rda"))
vsd <- varianceStabilizingTransformation(dds,blind=FALSE)
vst <- assay(vsd)
vst <- vst - min(vst)
dir.create(here("data/analysis/DE"),showWarnings=FALSE)
save(vst,file=here("data/analysis/DE/salmon-vst-aware.rda"))
goi <- read_csv(here("doc/Candidates_and_flowering_genes.csv"))
## Parsed with column specification:
## cols(
## Name = col_character(),
## ID = col_character()
## )
stopifnot(all(goi$ID %in% rownames(vst)))
dev.null <- apply(goi,1,function(ro){line_plot(dds=dds,vst=vst,gene_id = ro[2],gene_name = ro[1])})
## Plotting AT2G47180
## Plotting AT1G56600
## Plotting AT1G09350
## Plotting AT1G60470
## Plotting AT1G60450
## Plotting AT3G28340
## Plotting AT1G55740
## Plotting AT3G57520
## Plotting AT4G01970
## Plotting AT5G40390
## Plotting AT5G20250
## Plotting AT3G06580
## Plotting AT1G65480
## Plotting AT4G20370
## Plotting AT1G22770
## Plotting AT2G45660
## Plotting AT4G35900
## Plotting AT5G15840
## Plotting AT5G61850
## Plotting AT1G28130
## Plotting AT4G27260
## Plotting AT5G03840
ddsApex <- dds[,dds$TISSUE=="APEX"]
ddsApex <- cbind(ddsApex,ddsApex[,ddsApex$TREATMENT=="Na"])
ddsApex$TREATMENT[ddsApex$TREATMENT=="Na"] <- rep(c("Mock","Dexa"),each=3)
ddsApex$TREATMENT <- droplevels(ddsApex$TREATMENT)
design(ddsApex) <- ~ TIME * TREATMENT
ddsApex <- DESeq(ddsApex)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Update the vst for convenience
vsdApex <- varianceStabilizingTransformation(ddsApex,blind=FALSE)
vstApex <- assay(vsdApex)
vstApex <- vstApex - min(vstApex)
plotDispEsts(ddsApex)
Check the different contrasts
resultsNames(ddsApex)
## [1] "Intercept" "TIME_T1_vs_T0" "TIME_T3_vs_T0"
## [4] "TREATMENT_Dexa_vs_Mock" "TIMET1.TREATMENTDexa" "TIMET3.TREATMENTDexa"
#’ ```{r res, echo=FALSE,eval=FALSE} CHANGEME - here you need to define the contrast you want to study - see the example in the next block.
The contrast can be given by name, as a list (numerator/denominator) or as a vector of weight (e.g. c(0,1)); read the DESeq2 vignette for more info
The label argument is typically one (or a combination) of the metadata stored in colData
The function allows for looking at the independent filtering results using debug=TRUE
If you are not satisfied with the default from DESeq2, you can set your own cutoff using expression_cutoff
You can change the default output file prefix using default_prefix
You can select the set of samples to be added to the heatmap, using the sample_sel argument. It takes a logical vector.
```
AdexaT1 <- extract_results(ddsApex,vstApex,
contrast = "TIMET1.TREATMENTDexa",
default_prefix = "Salmon-Apex_Dexa-vs-Mock_T1_",
labels = ddsApex$TREATMENT,
sample_sel = ddsApex$TIME=="T1"
)
## Loading required package: LSD
## The independent filtering cutoff is 8.49496, removing 39.0077% of the data
## The independent filtering cutoff is 8.49496, removing 39.0077% of the data
## The independent filtering maximises for 42.03431 % of the data, corresponding to a base mean expression of 16.20359 (library-size normalised read)
## There are 299 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
AdexaT3 <- extract_results(ddsApex,vstApex,
contrast = "TIMET3.TREATMENTDexa",
default_prefix = "Salmon-Apex_Dexa-vs-Mock_T3_",
labels = ddsApex$TREATMENT,
sample_sel = ddsApex$TIME=="T3")
## The independent filtering cutoff is 8.49496, removing 39.0077% of the data
## The independent filtering cutoff is 8.49496, removing 39.0077% of the data
## The independent filtering maximises for 42.03431 % of the data, corresponding to a base mean expression of 16.20359 (library-size normalised read)
## There are 429 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
res.list <- list(AdexaT1=AdexaT1,
AdexaT3=AdexaT3)
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","all"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","up"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","dn"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
ddsLeaf <- dds[,dds$TISSUE=="LEAF"]
ddsLeaf <- cbind(ddsLeaf,ddsLeaf[,ddsLeaf$TREATMENT=="Na"])
ddsLeaf$TREATMENT[ddsLeaf$TREATMENT=="Na"] <- rep(c("Mock","Dexa"),each=3)
ddsLeaf$TREATMENT <- droplevels(ddsLeaf$TREATMENT)
design(ddsLeaf) <- ~ TIME * TREATMENT
ddsLeaf <- DESeq(ddsLeaf)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Update the vst for convenience
vsdLeaf <- varianceStabilizingTransformation(ddsLeaf,blind=FALSE)
vstLeaf <- assay(vsdLeaf)
vstLeaf <- vstLeaf - min(vstLeaf)
plotDispEsts(ddsLeaf)
Check the different contrasts
resultsNames(ddsLeaf)
## [1] "Intercept" "TIME_T1_vs_T0" "TIME_T3_vs_T0"
## [4] "TREATMENT_Dexa_vs_Mock" "TIMET1.TREATMENTDexa" "TIMET3.TREATMENTDexa"
#’ ```{r res, echo=FALSE,eval=FALSE} CHANGEME - here you need to define the contrast you want to study - see the example in the next block.
The contrast can be given by name, as a list (numerator/denominator) or as a vector of weight (e.g. c(0,1)); read the DESeq2 vignette for more info
The label argument is typically one (or a combination) of the metadata stored in colData
The function allows for looking at the independent filtering results using debug=TRUE
If you are not satisfied with the default from DESeq2, you can set your own cutoff using expression_cutoff
You can change the default output file prefix using default_prefix
You can select the set of samples to be added to the heatmap, using the sample_sel argument. It takes a logical vector.
```
LdexaT1 <- extract_results(ddsLeaf,vstLeaf,
contrast = "TIMET1.TREATMENTDexa",
default_prefix = "Salmon-Leaf_Dexa-vs-Mock_T1_",
labels = ddsLeaf$TREATMENT,
sample_sel = ddsLeaf$TIME=="T1")
## The independent filtering cutoff is 0.02989, removing 24.96441% of the data
## The independent filtering cutoff is 0.02989, removing 24.96441% of the data
## The independent filtering maximises for 24.96441 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 3 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
LdexaT3 <- extract_results(ddsLeaf,vstLeaf,
contrast = "TIMET3.TREATMENTDexa",
default_prefix = "Salmon-Leaf_Dexa-vs-Mock_T3_",
labels = ddsLeaf$TREATMENT,
sample_sel = ddsLeaf$TIME=="T3")
## The independent filtering cutoff is 0.02989, removing 24.96441% of the data
## The independent filtering cutoff is 0.02989, removing 24.96441% of the data
## The independent filtering maximises for 24.96441 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 1 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
## Warning in extract_results(ddsLeaf, vstLeaf, contrast =
## "TIMET3.TREATMENTDexa", : There are not enough DE genes to create a heatmap
res.list <- list(LdexaT1=LdexaT1,
LdexaT3=LdexaT3)
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","all"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","up"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","dn"),
filename=NULL,category.names=names(res.list),fill=pal[1:2]))
res.list <- list(AdexaT1=AdexaT1,
AdexaT3=AdexaT3,
LdexaT1=LdexaT1,
LdexaT3=LdexaT3)
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","all"),
filename=NULL,category.names=names(res.list),fill=pal[1:4]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","up"),
filename=NULL,category.names=names(res.list),fill=pal[1:4]))
grid.newpage()
grid.draw(venn.diagram(x=lapply(res.list,"[[","dn"),
filename=NULL,category.names=names(res.list),fill=pal[1:4]))
background <- rownames(vst)[featureSelect(vst,dds$CONDITION,exp=0.2)]
stopifnot(all(unlist(res.list) %in% background))
enr.list <- lapply(res.list,function(r){
lapply(r,gopher,background=background,task="go",url="athaliana")
})
## Loading required package: jsonlite
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
## No enrichments found in task: go
## Error: 'genes' needs to be a JSON array of multiple genes
## No enrichments found in task: go
## Error: 'genes' needs to be a JSON array of multiple genes
## Error: 'genes' needs to be a JSON array of multiple genes
## Error: Enrichment selected but no test genes supplied.
dev.null <- lapply(names(enr.list),function(n){
r <- enr.list[[n]]
if(! is.null(r$all$go)){
write_tsv(r$all$go,path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-all-DE-genes_GO-enrichment.tsv")))))
write_tsv(r$all$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-all-DE-genes_GO-enrichment_for-REVIGO.tsv")))))
}
if(! is.null(r$up$go)){
write_tsv(r$up$go,path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-up-DE-genes_GO-enrichment.tsv")))))
write_tsv(r$up$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-up-DE-genes_GO-enrichment_for-REVIGO.tsv")))))
}
if(! is.null(r$dn$go)){
write_tsv(r$dn$go,path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-down-DE-genes_GO-enrichment.tsv")))))
write_tsv(r$dn$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
paste0(n,"Salmon-down-DE-genes_GO-enrichment_for-REVIGO.tsv")))))
}
})
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] jsonlite_1.6.1 LSD_4.0-0
## [3] limma_3.42.2 VennDiagram_1.6.20
## [5] futile.logger_1.4.3 forcats_0.4.0
## [7] stringr_1.4.0 dplyr_0.8.4
## [9] purrr_0.3.3 readr_1.3.1
## [11] tidyr_1.0.2 tibble_2.1.3
## [13] tidyverse_1.3.0 RColorBrewer_1.1-2
## [15] hyperSpec_0.99-20200213 xml2_1.2.2
## [17] ggplot2_3.2.1 lattice_0.20-38
## [19] here_0.1 gplots_3.0.1.2
## [21] DESeq2_1.26.0 SummarizedExperiment_1.16.1
## [23] DelayedArray_0.12.2 BiocParallel_1.20.1
## [25] matrixStats_0.55.0 Biobase_2.46.0
## [27] GenomicRanges_1.38.0 GenomeInfoDb_1.22.0
## [29] IRanges_2.20.2 S4Vectors_0.24.3
## [31] BiocGenerics_0.32.0 data.table_1.12.8
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rprojroot_1.3-2 htmlTable_1.13.3
## [4] XVector_0.26.0 fs_1.3.1 base64enc_0.1-3
## [7] rstudioapi_0.11 farver_2.0.3 bit64_0.9-7
## [10] fansi_0.4.1 AnnotationDbi_1.48.0 lubridate_1.7.4
## [13] splines_3.6.2 geneplotter_1.64.0 knitr_1.28
## [16] Formula_1.2-3 broom_0.5.4 annotate_1.64.0
## [19] cluster_2.1.0 dbplyr_1.4.2 png_0.1-7
## [22] compiler_3.6.2 httr_1.4.1 backports_1.1.5
## [25] assertthat_0.2.1 Matrix_1.2-18 lazyeval_0.2.2
## [28] cli_2.0.1 formatR_1.7 acepack_1.4.1
## [31] htmltools_0.4.0 tools_3.6.2 gtable_0.3.0
## [34] glue_1.3.1 GenomeInfoDbData_1.2.2 Rcpp_1.0.3
## [37] cellranger_1.1.0 vctrs_0.2.2 gdata_2.18.0
## [40] nlme_3.1-144 xfun_0.12 rvest_0.3.5
## [43] testthat_2.3.1 lifecycle_0.1.0 gtools_3.8.1
## [46] XML_3.99-0.3 zlibbioc_1.32.0 scales_1.1.0
## [49] hms_0.5.3 lambda.r_1.2.4 curl_4.3
## [52] yaml_2.2.1 memoise_1.1.0 gridExtra_2.3
## [55] rpart_4.1-15 latticeExtra_0.6-29 stringi_1.4.5
## [58] RSQLite_2.2.0 highr_0.8 genefilter_1.68.0
## [61] checkmate_2.0.0 caTools_1.18.0 rlang_0.4.4
## [64] pkgconfig_2.0.3 bitops_1.0-6 evaluate_0.14
## [67] labeling_0.3 htmlwidgets_1.5.1 bit_1.1-15.2
## [70] tidyselect_1.0.0 magrittr_1.5 R6_2.4.1
## [73] generics_0.0.2 Hmisc_4.3-1 DBI_1.1.0
## [76] pillar_1.4.3 haven_2.2.0 foreign_0.8-75
## [79] withr_2.1.2 survival_3.1-8 RCurl_1.98-1.1
## [82] nnet_7.3-12 modelr_0.1.5 crayon_1.3.4
## [85] futile.options_1.0.1 KernSmooth_2.23-16 rmarkdown_2.1
## [88] jpeg_0.1-8.1 locfit_1.5-9.1 readxl_1.3.1
## [91] blob_1.2.1 reprex_0.3.0 digest_0.6.24
## [94] xtable_1.8-4 munsell_0.5.0